************************************************************************
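/* This do-file creates the instruments and the sample restrictions for
the data in
Agan, Doleac, and Harvey, "Misdemeanor Prosecution".
It creates the .dta files you need in 2_tablesandfigures.

You will need:

 *1.  suffolk_base.dta
 *2.  rangestat, downloadable from SSC (ssc install rangestat)
 *3.  reghdfe, which this file calls to residualize outcomes; it and its
      dependency ftools are downloadable from SSC (ssc install ftools,
      ssc install reghdfe)

*/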
************************************************************************

* start with nothing in memory
clear

* Root folder of the replication archive. suffolk_base.dta is read from this
* folder and every .dta created below is written back into it, so edit the
* path to match your machine before running.
global path = "/Users/aagan/Dropbox/Prosecutorial Reform Initiative/Suffolk/Do/QJE Replication Archive"


/****************************************************************
First make the main estimation samples for recidivism within
1 year/2 years/3 years/4 years/5 years/6 years

The only real difference between them is the restriction on the latest
case date, which gives everyone the full recidivism follow-up time

We look only after 2003 because that is when the data are most reliable

****************************************************************/
forvalues i=1(1)6 {
  	
use "$path/suffolk_base.dta", clear

* Pick the recidivism horizon for this pass of the loop.
* The six samples share every restriction except the latest admissible
* first_event_date, so the cutoffs sit in one list and the condition is
* written once rather than copied six times.
* Cutoffs are Stata daily dates (days since 01jan1960):
*   21793 = 01sep2019   21428 = 01sep2018   21063 = 01sep2017
*   20698 = 01sep2016   20332 = 01sep2015   19967 = 01sep2014
local cutoffs 21793 21428 21063 20698 20332 19967
local lengths oneyears twoyears threeyears fouryears fiveyears sixyears

* entry number i of each list belongs to the i-year follow-up window
local cutoff : word `i' of `cutoffs'
local length : word `i' of `lengths'

* Main estimation sample: cases after 2003 and before the cutoff, in a
* municipal/district court, no felony, a non-missing first prosecutor,
* and neither a violent nor a weapons offense.
* The length local is reused further down to name the saved files.
global sample "first_event_year>2003 & first_event_date<`cutoff' & muni_district_court==1 & felony_correct_any==0 & first_pros~=. & type_violent==0 & type_weapons==0"

* Covariates: the missing-value screen further down loops over this list
global covars " number_counts_correct number_misd_correct number_misd_high anyconv_misd_oneyearpriorc  anyconv_felony_oneyearpriorc citizen  type_pettybs_only type_mv_only type_drug_only male  age2 age3 age4 predwhi predbla predhis"


* Fixed effects absorbed when the first-stage outcome is residualized
global FE "court_month2 court_dow2"
* Time fixed effect used for the singleton check further down
global timeFE "court_month2"

* Cluster variables (stored here; not referenced again in this file)
global clusters "id_prsn_dfndnt first_pros"


**************************************
/* Create the Various Instruments */
**************************************

**1. Main Pooled Instrument

* Step 1: take the fixed effects out of the first-stage outcome and keep the
* residual for in-sample cases
quietly {
	reghdfe ng_immed_all if $sample, absorb($FE) resid
	predict resid if $sample, resid
}

* Step 2: per defendant-prosecutor pair, the number of residuals and their
* mean, so that all of a defendant's own cases can be taken out of that
* prosecutor's average
quietly {
	bysort id_prsn_dfndnt first_pros: egen i_obs = count(resid) if $sample
	bysort id_prsn_dfndnt first_pros: egen i_resid = mean(resid) if $sample
}

* Step 3: per prosecutor, the mean residual and the number of residuals; the
* instrument is the prosecutor's residual total minus the defendant's own
* total, divided by the number of remaining cases
quietly {
	bysort first_pros: egen tmp_mean = mean(resid) if $sample
	bysort first_pros: egen tmp_obs = count(resid) if $sample
	gen ADA_iv = (tmp_mean*tmp_obs - (i_obs*i_resid)) / (tmp_obs - i_obs) if $sample
}

* the helper variables are not needed once the instrument exists
drop tmp* resid i_*

label variable ADA_iv "Not Prosecuted"
**2. ADA_iv without residualizing

* Same leave-out construction as the main instrument, but applied to the raw
* outcome ng_immed_all instead of its residual.
* (i_resid keeps its name from the block above; here it holds the mean of the
* raw outcome for the defendant-prosecutor pair.)
quietly {
	* defendant-prosecutor pair: number of cases and mean outcome
	bysort id_prsn_dfndnt first_pros: egen i_obs = count(ng_immed_all) if $sample
	bysort id_prsn_dfndnt first_pros: egen i_resid = mean(ng_immed_all) if $sample

	* prosecutor: mean outcome and number of cases
	bysort first_pros: egen tmp_mean = mean(ng_immed_all) if $sample
	bysort first_pros: egen tmp_obs = count(ng_immed_all) if $sample

	* prosecutor mean with the defendant's own cases removed
	gen ADA_iv_noresid = (tmp_mean*tmp_obs - (i_obs*i_resid)) / (tmp_obs - i_obs) if $sample
}

drop tmp* i_*

**3. Instrument x crime type

* One leave-out instrument per crime category, each built only from the
* in-sample cases of that category
foreach crime of varlist type_mv_only type_drug_only type_pettybs_only type_uncat_only {

	* restriction shared by every step: estimation sample, this category only
	local cell "$sample  & `crime'==1"

	quietly {
		* residualize the first-stage outcome within the category
		reghdfe ng_immed_all if `cell', absorb($FE) resid
		predict resid if `cell', resid

		* defendant-prosecutor pair: count and mean of the residuals
		bysort id_prsn_dfndnt first_pros: egen i_obs = count(resid) if `cell'
		bysort id_prsn_dfndnt first_pros: egen i_resid = mean(resid) if `cell'

		* prosecutor: mean and count, then the leave-out mean
		bysort first_pros: egen tmp_mean = mean(resid) if `cell'
		bysort first_pros: egen tmp_obs = count(resid) if `cell'
		gen ADA_iv_`crime' = (tmp_mean*tmp_obs - (i_obs*i_resid)) / (tmp_obs - i_obs) if `cell'

		drop tmp* i_* resid
	}
}

* Stack the category-specific instruments into one variable: start from the
* motor-vehicle one, then fill in the other categories in turn
quietly gen ADA_iv_crime = ADA_iv_type_mv_only if $sample  & type_mv_only==1
foreach crime of varlist type_drug_only type_pettybs_only type_uncat_only {
	replace ADA_iv_crime = ADA_iv_`crime' if $sample  & `crime'==1
}

**4. Instrument x victim

* victim is 1 exactly when victimless equals 0
gen victim = (victimless==0)

* One leave-out instrument for each of the two groups
foreach grp of varlist victim victimless {

	* estimation sample restricted to the current group
	local cell "$sample  & `grp'==1"

	* residualize the first-stage outcome within the group
	reghdfe ng_immed_all if `cell', absorb($FE) resid
	predict resid if `cell', resid

	* defendant-prosecutor pair: count and mean of the residuals
	bysort id_prsn_dfndnt first_pros: egen i_obs = count(resid) if `cell'
	bysort id_prsn_dfndnt first_pros: egen i_resid = mean(resid) if `cell'

	* prosecutor: mean and count, then the leave-out mean
	bysort first_pros: egen tmp_mean = mean(resid) if `cell'
	bysort first_pros: egen tmp_obs = count(resid) if `cell'
	gen ADA_iv_`grp' = (tmp_mean*tmp_obs - (i_obs*i_resid)) / (tmp_obs - i_obs) if `cell'

	drop tmp* i_* resid
}

* stack the two group-specific instruments into one variable
gen ADA_iv_vic = ADA_iv_victim if $sample & victim==1
replace ADA_iv_vic = ADA_iv_victimless if $sample & victimless==1

**5. Instrument x ADA experience

* Experience is the running count of cases a prosecutor has handled, counted
* separately by felony_correct_any and type_violent and ordered by
* first_event_date.
* Several cases can share a date, and sort orders tied observations randomly,
* so without a tie-breaker pros_exper (and everything built on it) would
* differ from run to run. id_cs is added as the final sort key and the sort
* is made stable to pin the order down.
* NOTE(review): this assumes id_cs identifies a case uniquely - confirm. If it
* does not, the remaining ties keep whatever order the data already had.
sort first_pros felony_correct_any type_violent first_event_date id_cs, stable
by first_pros felony_correct_any type_violent: gen pros_exper = _n

* split at the in-sample median; the median is copied into a local right away
* so that it cannot be lost if another command overwrites the r() results
summarize pros_exper if $sample, detail
local med_exper = r(p50)
gen high_pros_exper = pros_exper >= `med_exper'
gen low_pros_exper = pros_exper < `med_exper'


* One leave-out instrument for each experience group
foreach grp of varlist high_pros_exper low_pros_exper {

	* estimation sample restricted to the current experience group
	local cell "$sample  & `grp'==1"

	* residualize the first-stage outcome within the group
	reghdfe ng_immed_all if `cell', absorb($FE) resid
	predict resid if `cell', resid

	* defendant-prosecutor pair: count and mean of the residuals, used to
	* remove all of a defendant's cases from the instrument
	bysort id_prsn_dfndnt first_pros: egen i_obs = count(resid) if `cell'
	bysort id_prsn_dfndnt first_pros: egen i_resid = mean(resid) if `cell'

	* prosecutor: mean and count, then the leave-out mean
	bysort first_pros: egen tmp_mean = mean(resid) if `cell'
	bysort first_pros: egen tmp_obs = count(resid) if `cell'
	gen ADA_iv_`grp' = (tmp_mean*tmp_obs - (i_obs*i_resid)) / (tmp_obs - i_obs) if `cell'

	drop tmp* i_* resid
}

* stack the two group-specific instruments into one variable
gen ADA_iv_exper = ADA_iv_high_pros_exper if $sample & high_pros_exper==1
replace ADA_iv_exper = ADA_iv_low_pros_exper if $sample & low_pros_exper==1


**6. Instrument using "no bail" as outcome instead

* nobail is 1 when ng_immed_all is 1 or when bail_requested_arr is 0
gen nobail = (ng_immed_all==1 | bail_requested_arr==0)

* residualize the alternative outcome on the fixed effects
reghdfe nobail if $sample, absorb($FE) resid
capture drop resid
predict resid if $sample, resid

* defendant-prosecutor pair: count and mean of the residuals
bysort id_prsn_dfndnt first_pros: egen i_obs = count(resid) if $sample
bysort id_prsn_dfndnt first_pros: egen i_resid = mean(resid) if $sample

* prosecutor: mean and count of the residuals
bysort first_pros: egen tmp_mean = mean(resid) if $sample
bysort first_pros: egen tmp_obs = count(resid) if $sample

* prosecutor mean with the defendant's own cases removed
gen bail_iv = (tmp_mean*tmp_obs - (i_obs*i_resid)) / (tmp_obs - i_obs) if $sample

drop tmp* resid i_*

** 7. Shrunken version of IV estimate

* rebuild the residualized first-stage outcome
capture drop resid
quietly {
	reghdfe ng_immed_all if $sample, absorb($FE) resid
	predict resid if $sample, resid

	* variance of the leniency measure across the sample, stored as a
	* constant column
	summarize ADA_iv if $sample, detail
	gen var_ADA_iv = r(Var)
}

* temporary in-sample flag so the by-groups below separate in-sample from
* out-of-sample cases of the same prosecutor
gen sample = $sample

quietly {
	* within-prosecutor variance: squared standard error of the residual mean
	bysort first_pros sample: egen sd = sd(resid)
	bysort first_pros sample: egen n_pros_sample = count(id_cs)
	gen var_i = (sd/sqrt(n_pros_sample))^2 if $sample

	* shrinkage factor, then the shrunken instrument
	gen R = (var_ADA_iv/(var_ADA_iv + var_i))
	gen ADA_iv_shrink = ADA_iv*R
}

* the flag is rebuilt after the remaining sample restrictions are applied
drop sample


   
**************************************
/* Additional Sample Restrictions */
**************************************

* Keep only prosecutors with 30 or more cases. n_pros counts a prosecutor's
* non-missing ADA_iv values over the whole estimation sample.
bysort first_pros: egen n_pros = count(ADA_iv) if $sample
bysort first_pros: gen n = _n
* distribution of caseloads, one row per prosecutor
summarize n_pros if n==1 & $sample, detail
global sample "$sample & n_pros>=30"


* Flag cases with a missing value in any covariate and drop them from the sample

gen any_missing = 0
foreach x of global covars {
	replace any_missing = 1 if mi(`x')
}

global sample "$sample & any_missing==0"


* Drop singletons: in-sample cases that are alone in their cell of the time
* fixed effect named in the timeFE global
capture drop insample singleton
gen insample = $sample
bysort $timeFE insample: gen singleton = (_N==1)
tabulate singleton if $sample

global sample "$sample & singleton!=1"

* indicator for the final estimation sample (1 in sample, missing otherwise)
gen sample = 1 if $sample


**************************************
/* Additional Variables */
**************************************
* date of the Monday that starts each case's week (dow is 0 for Sunday),
* and an id for each week-by-court cell
gen mon_week = first_event_date - mod(dow(first_event_date)-1, 7)
egen court_monweek2 = group(mon_week code_crt_lctn_case)

**************************************
/* Save */
**************************************

* One-year pass only: also write a file for the Rollins analysis. Its sample
* does not condition on felonies or on a non-missing first prosecutor.
if "`length'"=="oneyears" {
	preserve

	* extra variables needed for the Rollins analysis

	* juvenile court locations
	gen juv_court = inlist(code_crt_lctn_case, "CHEJ", "DORJ", "JUV", "WRXJ")

	* post-inauguration period (21551 is 02jan2019 as a Stata daily date)
	gen post_rachael = (first_event_date>=21551)

	gen post_rachael_rachael = post_rachael*rachael_crime

	* nonviolent, non-weapons case that is not flagged as a felony
	gen nv_misd = (type_violent==0 & type_weapons==0 & felony_correct_any!=1)

	gen post_rachael_nv = post_rachael*nv_misd

	* nonviolent misdemeanor with rachael_crime equal to 0
	gen nvmisd_notrachael = (felony_correct_any==0 & type_violent==0 & type_weapons==0 & rachael_crime==0)

	gen post_rachael_notrachael = nvmisd_notrachael*post_rachael

	label variable post_rachael "Post Rollins"
	label variable post_rachael_nv "Post Rollins x NV Misd"

	gen dummy = 1

	egen court_location_case = group(code_crt_lctn_case)

	* NOTE(review): this cutoff is 21794 while the one-year sample uses 21793 - confirm the one-day difference is intended
	global sample2 "first_event_year>2017 & first_event_date<21794 & juv_court==0  & type_violent==0 & type_weapons==0 & any_missing==0"
	keep if $sample2
	save "$path/suffolk_est_`length'_demos_rollins.dta", replace

	restore
}

* Main data file: only the estimation sample, without the flag itself
preserve
keep if sample==1
drop sample
save "$path/suffolk_est_`length'_demos.dta", replace
restore

* Two-year pass only: also write a version that does not condition on a
* non-missing first prosecutor or on complete demographics, for later analysis.
* No preserve is needed because the next pass reloads the base file.
if "`length'"=="twoyears" {
	keep if first_event_year>2003 & first_event_date<21428  & muni_district_court==1 & felony_correct_any==0 & type_violent==0 & type_weapons==0
	save "$path/suffolk_est_`length'_demos_inclmissingfp.dta", replace
}



* end loop

}

/****************************************************************
Create imputation samples
For missing ADAs
****************************************************************/

use "$path/suffolk_base.dta", clear

* Same restrictions as the two-year recidivism sample, except that the
* condition first_pros~=. is left out: the cases with a missing prosecutor
* are exactly the ones to be imputed
global sample_np "first_event_year>2003 & first_event_date<21428  & muni_district_court==1 & felony_correct_any==0  & type_violent==0 & type_weapons==0"

keep if $sample_np


* Date of the Monday that starts each case's week (dow is 0 for Sunday)
gen mon_week = first_event_date - mod(dow(first_event_date)-1, 7)

* number of distinct prosecutors at the court on that date
egen tag = tag(first_pros code_crt_lctn_case first_event_date)
bysort code_crt_lctn_case first_event_date: egen n_pros_cwd = total(tag)
drop tag

* number of cases at the court on that date
bysort first_event_date code_crt_lctn_case: egen n_cwd = count(first_event_date)

* number of cases for that prosecutor at the court on that date
bysort first_event_date code_crt_lctn_case first_pros: egen n_cwd_forpros = count(first_event_date)

* number of cases with a non-missing prosecutor at the court on that date
bysort first_event_date code_crt_lctn_case: egen n_haspros = count(first_pros)

* number of distinct prosecutors at the court in that week
egen tag = tag(first_pros code_crt_lctn_case mon_week)
bysort code_crt_lctn_case mon_week: egen n_pros_week = total(tag)
drop tag

* Number of distinct prosecutor-weekday combinations at the court in that
* week; when a single prosecutor works the week this is the number of days
* that prosecutor worked.
* The tag variable from this step is deliberately left in the data, exactly
* as before, so the saved file keeps the same set of variables.
egen tag = tag(first_pros code_crt_lctn_case mon_week first_event_dow)
bysort code_crt_lctn_case mon_week: egen n_pros_week_days = total(tag)

* number of cases with a non-missing prosecutor at the court in that week
bysort code_crt_lctn_case mon_week: egen n_haspros_week = count(first_pros)

* The weekly maximum of first_pros used to be computed here into temp, but it
* was dropped again before anything read it; each imputation rule computes
* its own temp, so that dead step has been removed.


* ---------------------------- IMPUTATIONS ----------------------------
* In every rule below temp holds the largest first_pros in the cell. The rules
* only use it where the cell has exactly one distinct prosecutor, so there it
* is simply that prosecutor.

* IMPUTATION 1: "Most Restrictive"
* DAY:  fill in when exactly one prosecutor handles cases at the court that
*       day AND that prosecutor has more than one case, OR has one case and
*       the day has only two cases in total
* WEEK: fill in when exactly one prosecutor handles cases at the court that
*       week, more than two cases that week have a known prosecutor, and the
*       prosecutor-weekday count for the week is above one

gen first_pros_imp1 = first_pros
capture drop temp

* day rule
bysort first_event_date code_crt_lctn_case: egen temp = max(first_pros)
replace first_pros_imp1 = temp if mi(first_pros) & n_pros_cwd==1 & (n_haspros>1 | (n_haspros==1 & n_cwd==2))
drop temp

* week rule
bysort code_crt_lctn_case mon_week: egen temp = max(first_pros)
replace first_pros_imp1 = temp if mi(first_pros) & n_pros_week==1 & n_haspros_week>2 & n_pros_week_days>1
drop temp

* IMPUTATION 2: starts from imputation 1 and drops the extra conditions
* DAY:  fill in when exactly one prosecutor handles cases at the court that day
* WEEK: fill in when exactly one prosecutor handles cases at the court that week

gen first_pros_imp2 = first_pros_imp1

* day rule
bysort first_event_date code_crt_lctn_case: egen temp = max(first_pros)
replace first_pros_imp2 = temp if mi(first_pros_imp2) & n_pros_cwd==1
drop temp

* week rule
bysort code_crt_lctn_case mon_week: egen temp = max(first_pros)
replace first_pros_imp2 = temp if mi(first_pros_imp2) & n_pros_week==1
drop temp

* IMPUTATION 3: starts from imputation 2 and fills what is still missing with
* the modal prosecutor of the court-day. egen mode returns missing when
* several values tie for the mode, and in that case nothing is filled in.
gen first_pros_imp3 = first_pros_imp2
bysort first_event_date code_crt_lctn_case: egen mpros = mode(first_pros)
replace first_pros_imp3 = mpros if mi(first_pros_imp3) & !mi(mpros)
drop mpros

* IMPUTATION 4: starts from imputation 3 and fills what is still missing with
* the modal prosecutor of the court-week, again leaving ties unfilled.
gen first_pros_imp4 = first_pros_imp3
bysort code_crt_lctn_case mon_week: egen mpros = mode(first_pros)
replace first_pros_imp4 = mpros if mi(first_pros_imp4) & !mi(mpros)
drop mpros

/******************** CREATING INSTRUMENT ********************/

* Fixed effects absorbed when residualizing the first-stage outcome
global FE "court_month2 court_dow2"

* Time fixed effect used to find singletons (put the court-month or the
* court-week variable here so that the right singletons are dropped)
global timeFE "court_month2"

* Cluster variables
global clusters "id_prsn_dfndnt first_pros"

* Covariates screened for missing values in the sample restrictions below
global covars " number_counts_correct number_misd_correct number_misd_high anyconv_misd_oneyearpriorc  anyconv_felony_oneyearpriorc citizen  type_pettybs_only type_mv_only type_drug_only male  age2 age3 age4  predwhi predbla predhis"

* The instrument is built only from cases whose first prosecutor is known, so
* this sample adds first_pros~=. to the two-year restrictions
global sample "first_event_year>2003 & first_event_date<21428  & muni_district_court==1 & felony_correct_any==0 & first_pros~=. & type_violent==0 & type_weapons==0"


**1. Main Pooled Instrument

* residualize the first-stage outcome on the fixed effects
quietly {
	reghdfe ng_immed_all if $sample, absorb($FE) resid
	predict resid if $sample, resid
}

quietly {
	* defendant-prosecutor pair: count and mean of the residuals, used to
	* remove all of a defendant's cases from the instrument
	bysort id_prsn_dfndnt first_pros: egen i_obs = count(resid) if $sample
	bysort id_prsn_dfndnt first_pros: egen i_resid = mean(resid) if $sample

	* prosecutor: mean and count of the residuals
	bysort first_pros: egen tmp_mean = mean(resid) if $sample
	bysort first_pros: egen tmp_obs = count(resid) if $sample

	* main ADA instrument: residualized prosecutor mean with the defendant's
	* own cases removed
	gen ADA_iv = (tmp_mean*tmp_obs - (i_obs*i_resid)) / (tmp_obs - i_obs) if $sample
}

drop tmp* resid i_*

* distribution of the instrument
summarize ADA_iv, detail



capture drop temp


/******************** Sample Restrictions ********************/

* Keep only prosecutors with at least 30 in-sample cases that have an
* instrument value

bysort first_pros: egen n_pros = count(ADA_iv) if $sample
bysort first_pros: gen n = _n
* distribution of caseloads, one row per prosecutor
summarize n_pros if n==1 & $sample, detail
global sample "$sample & n_pros>=30"


* Flag cases with a missing value in any covariate and drop them from the sample

gen any_missing = 0
foreach x of global covars {
	replace any_missing = 1 if mi(`x')
}

global sample "$sample & any_missing==0"


* Drop in-sample cases that are alone in their cell of the time fixed effect
capture drop insample singleton
gen insample = $sample
bysort $timeFE insample: gen singleton = (_N==1)
tabulate singleton if $sample

global sample "$sample & singleton!=1"



*** Cases with an imputed prosecutor receive that prosecutor's leniency: the
*** mean of ADA_iv over the cases sharing the same first_pros_imp4. ADA_iv is
*** only defined where first_pros is known, so only those cases enter the mean.
gen ADA_iv_imp = ADA_iv
bysort first_pros_imp4: egen temp = mean(ADA_iv)
replace ADA_iv_imp = temp if mi(ADA_iv_imp)
drop temp

*** the prosecutor's case count is carried over to the imputed cases the same way
gen n_pros_imp = n_pros
bysort first_pros_imp4: egen temp = mean(n_pros)
replace n_pros_imp = temp if mi(n_pros_imp)
drop temp


*** save ****

save "$path/suffolk_est_twoyears_imputation.dta", replace
